Read in data

energy <- read_csv(here("data", "energy.csv"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   month = col_character(),
##   res_total = col_double(),
##   ind_total = col_double()
## )

convert month from character to date using tsibble

energy_ts <- energy %>% 
  mutate(date = yearmonth(month)) %>% 
  as_tsibble(key = NULL, index = date)

lets look at our time series data

ggplot(energy_ts, aes(x = date, y = res_total)) +
  geom_line() +
  labs(y = "residential energy consumption \n (Trillion BTU)")

Trends: - increasing overall but slightly decreasing around 2005 - seasonality in peaks - no cyclicality or outliers

Seasonplot:

energy_ts %>% 
  gg_season(y= res_total) +
  theme_minimal() +
  labs( x = "month",
        y= "residential energy consumption (trillion BTU)")
## Warning in NextMethod("["): Incompatible methods (">=.Date", ">=.vctrs_vctr")
## for ">="
## Warning in NextMethod("["): Incompatible methods ("<=.Date", "<=.vctrs_vctr")
## for "<="
## Warning in NextMethod("["): Incompatible methods (">=.Date", ">=.vctrs_vctr")
## for ">="
## Warning in NextMethod("["): Incompatible methods ("<=.Date", "<=.vctrs_vctr")
## for "<="

Trends: - highest in winter months (dec, jan, feb) - peaks back up around jul and aug - increasing over time in general

Subseries plot

energy_ts %>% gg_subseries(res_total)

Trends: - clear seasonailty with increasing trend overtime - peaks in dec, jan, feb confirmed

Decomposition

# Find STL decomposition
dcmp <- energy_ts %>%
  model(STL(res_total ~ season()))

# View the components
components(dcmp)
## # A dable:           538 x 7 [1M]
## # Key:               .model [1]
## # STL Decomposition: res_total = trend + season_year + remainder
##    .model               date res_total trend season_year remainder season_adjust
##    <chr>               <mth>     <dbl> <dbl>       <dbl>     <dbl>         <dbl>
##  1 STL(res_total ~… 1973 Jan     1932. 1257.       684.      -9.21         1248.
##  2 STL(res_total ~… 1973 Feb     1687. 1253.       465.     -31.3          1222.
##  3 STL(res_total ~… 1973 Mar     1497. 1250.       242.       5.67         1255.
##  4 STL(res_total ~… 1973 Apr     1178. 1246.       -62.9     -5.49         1241.
##  5 STL(res_total ~… 1973 May     1015. 1242.      -256.      28.7          1271.
##  6 STL(res_total ~… 1973 Jun      934. 1238.      -313.       8.89         1247.
##  7 STL(res_total ~… 1973 Jul      981. 1234.      -231.     -22.0          1212.
##  8 STL(res_total ~… 1973 Aug     1019. 1231.      -225.      13.7          1244.
##  9 STL(res_total ~… 1973 Sep      957. 1227.      -314.      43.5          1271.
## 10 STL(res_total ~… 1973 Oct      992. 1224.      -271.      39.4          1263.
## # … with 528 more rows
# Visualize the decomposed components
components(dcmp) %>% autoplot() +
  theme_minimal()

## Autocorrelation function (ACF)

energy_ts %>% 
  ACF(res_total) %>% 
  autoplot()

Takeaway: - observations separated by 12 months are the most highly correlated, reflecting strong seasonality we see in all of our other exploratory visualizations.

D. Forecasting by Holt-Winters exponential smoothing

We’ll use multiplicative due to changes in variance over time

# Create the model:
energy_fit <- energy_ts %>%
  model(ets = ETS(res_total ~ season("M")))

# Forecast using the model 10 years into the future:
energy_forecast <- energy_fit %>% 
  forecast(h = "10 years")

# Plot just the forecasted values (with 80 & 95% CIs):
energy_forecast %>% 
  autoplot()

# Or plot it added to the original data:
energy_forecast %>% 
  autoplot(energy_ts)

## Assessing residuals

# Append the predicted values (and residuals) to original energy data
energy_predicted <- broom::augment(energy_fit)

# Use View(energy_predicted) to see the resulting data frame

Now, plot the actual energy values (res_total), and the predicted values (stored as .fitted) atop them:

ggplot(data = energy_predicted) +
  geom_line(aes(x = date, y = res_total)) +
  geom_line(aes(x = date, y = .fitted), color = "red")

## Explore the residuals

ggplot(data = energy_predicted, aes(x = .resid)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Extra forecasting bits

# Fit 3 different forecasting models (ETS, ARIMA, SNAIVE):
energy_fit_multi <- energy_ts %>%
  model(
    ets = ETS(res_total ~ season("M")),
    arima = ARIMA(res_total),
    snaive = SNAIVE(res_total)
  )

# Forecast 3 years into the future (from data end date)
multi_forecast <- energy_fit_multi %>% 
  forecast(h = "3 years")

# Plot the 3 forecasts
multi_forecast %>% 
  autoplot(energy_ts)

# Or just view the forecasts (note the similarity across models):
multi_forecast %>% 
  autoplot()

Part 2: Spatial data wrangling, visualization and a variogram

A. California county outlines (polygons)

ca_counties <- read_sf(here("data","ca_counties","CA_Counties_TIGER2016.shp"))

bit of wrangling

ca_subset <- ca_counties %>% 
  select(NAME, ALAND) %>% 
  rename(county_name = NAME, land_area = ALAND)

Check and set the CRS

Use st_crs() to check the existing CRS for spatial data. We see that this CRS is WGS84 (epsg: 3857).

ca_subset %>% st_crs()
## Coordinate Reference System:
##   User input: WGS 84 / Pseudo-Mercator 
##   wkt:
## PROJCRS["WGS 84 / Pseudo-Mercator",
##     BASEGEOGCRS["WGS 84",
##         DATUM["World Geodetic System 1984",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4326]],
##     CONVERSION["Popular Visualisation Pseudo-Mercator",
##         METHOD["Popular Visualisation Pseudo Mercator",
##             ID["EPSG",1024]],
##         PARAMETER["Latitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["False easting",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["World - 85°S to 85°N"],
##         BBOX[-85.06,-180,85.06,180]],
##     ID["EPSG",3857]]
ggplot(data = ca_subset) +
  geom_sf(aes(fill = land_area), color = "white", size = 0.1) +
  theme_void() +
  scale_fill_gradientn(colors = c("cyan","blue","purple"))

B. Invasive red sesbania records (spatial points)

sesbania <- read_sf(here("data","red_sesbania","ds80.shp"))

# Check the CRS:
sesbania %>% st_crs()
## Coordinate Reference System:
##   User input: Custom 
##   wkt:
## PROJCRS["Custom",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]],
##             ID["EPSG",6269]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["Degree",0.0174532925199433]]],
##     CONVERSION["unnamed",
##         METHOD["Albers Equal Area",
##             ID["EPSG",9822]],
##         PARAMETER["Latitude of false origin",0,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-120,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",34,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.5,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",-4000000,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["(E)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]],
##         AXIS["(N)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]]]
## Notice that this CRS is different from the California counties CRS, so we’ll want to update it to match. Use st_transform() to update the CRS:

sesbania <- st_transform(sesbania, 3857)

# Then check it: 
sesbania %>% st_crs()
## Coordinate Reference System:
##   User input: EPSG:3857 
##   wkt:
## PROJCRS["WGS 84 / Pseudo-Mercator",
##     BASEGEOGCRS["WGS 84",
##         DATUM["World Geodetic System 1984",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4326]],
##     CONVERSION["Popular Visualisation Pseudo-Mercator",
##         METHOD["Popular Visualisation Pseudo Mercator",
##             ID["EPSG",1024]],
##         PARAMETER["Latitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["False easting",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["World - 85°S to 85°N"],
##         BBOX[-85.06,-180,85.06,180]],
##     ID["EPSG",3857]]
## now they have the same CRS.

Now plot them together

ggplot() +
  geom_sf(data = ca_subset) +
  geom_sf(data = sesbania, size = 1, color = "red")

## more wrangling - say we want to find the count of red sesbania observed locations in this dataset by county. - How can I go about joining these data so that I can find counts? Don’t worry…st_join() has it covered

ca_sesbania <- ca_subset %>% 
  st_join(sesbania)

# And then we can find counts (note: these are not counts for individual plants, but by record in the dataset) by county:

sesbania_counts <- ca_sesbania %>% 
  count(county_name)

# Then we can plot a chloropleth using the number of records for red sesbania as the fill color (instead of what we used previously, land area):
ggplot(data = sesbania_counts) +
  geom_sf(aes(fill = n), color = "white", size = 0.1) +
  scale_fill_gradientn(colors = c("lightgray","orange","red")) +
  theme_minimal() +
  labs(fill = "Number of S. punicea records")

## Only plot the county with the greatest number of red sesbania records (Solano), and make a map of those locations:

# Subset of sesbania point locations only in Solano County
solano_sesbania <- sesbania %>% 
  filter(COUNTY == "Solano")

# Only keep Solano polygon from California County data
solano <- ca_subset %>% 
  filter(county_name == "Solano")

ggplot() +
  geom_sf(data = solano) +
  geom_sf(data = solano_sesbania)

## Make an interactive map with {tmap}

# Set the viewing mode to "interactive":
tmap_mode(mode = "view")
## tmap mode set to interactive viewing
# Then make a map (with the polygon fill color updated by variable 'land_area', updating the color palette to "BuGn"), then add another shape layer for the sesbania records (added as dots):
tm_shape(ca_subset) +
  tm_fill("land_area", palette = "BuGn") +
  tm_shape(sesbania) +
  tm_dots()

For extra resources see: - https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html - https://geocompr.robinlovelace.net/adv-map.html#interactive-maps